*---------------------------------------------------
cap clear mata
cap clear
capture log close
program drop _all
macro drop _all
version 10.0
set mem 700m
set mat 2000
set more off
cd "E:\REStat_MS14767_Vol96(2)\Data preparation Compustat"
log using "estimation_sample.log", replace
*---------------------------------------------------------------


********************************************************************************
********      in this file we create the final estimation sample  **************
********************************************************************************

use "compustat_master_america.dta", clear
so ticker year
keep  ticker vantagekey name countryinc employees sales forn_sal ta year SICALL2-SICALL15 SIC4 CUSIP rdexpense mkvalq
rename CUSIP cusip

gen SIC2=int(SIC4/100)
label var SIC2 "SIC4/100"



********************************************************
*****          here we add the variables related to RJV participation (see RJVpartic.do)
********************************************************

merge ticker year using RJVvars.dta
tab _merge
drop _merge

********************************************************
*****          here we add the variables related to links(see links.do)
********************************************************

so ticker year
merge ticker year using links.dta
tab _merge
drop _merge


********************************************************
*****           here we add the variables related to patents (see patentvars.do)
********************************************************

so cusip year
merge cusip year using "patentvars.dta"
tab _merge
drop _merge

************************************************************
*** 		some other patent vars are added (see patentpanel.do)
************************************************************

so ticker year
merge ticker year using "patentpanel.dta"
tab _merge
drop if _merge==2
drop _merge

************************************************************
*** Remark: Don't care about observations with _merge==2,
*** they are due to different time intervals, not to different
*** firms. That is, patent data starts already in 1969 whereas
*** Compustat starts in 1986. 
**************************************************************


**********************************************************
*****           WE KEEP ONLY AMERICAN FIRMS          *****
*****             FOR WHICH WE HAVE SALES            *****
**********************************************************

count if countryinc!=0

* sales are changed into sales_new by substracting forn_sales

replace forn_sales=0 if  forn_sales==.
gen sales_new=sales- forn_sales
count if sales_new<0
replace sales_new=. if sales_new<0
count if sales_new==.
drop if sales_new==.
label var sales_new "Sales - foreign sales"

compress
drop countryinc vantagekey sales forn_sales



********************* we correct some variables  ****************
********************* and generate some others     ****************

so ticker year
egen firmnum=group(ticker)
label var firmnum "group(ticker)"
egen ncomp=count(firmnum), by(SIC4 year)
label var ncomp "number of competitors per year/SIC4"

* this is just to control that there are no mistake in the panel
* if everythnig is correct no observation should be dropped!!
drop if firmnum==firmnum[_n-1]& year==year[_n-1]


********************************************************
*****           
*****           here we define market shares (our dep. var.) defining the market as the SIC4 industry
*****           
********************************************************

egen tot_sales=sum(sales_new), by (SIC4 year)
label var tot_sales "sum of sales by SIC4, year"
gen MS=sales_new/tot_sales
label var MS "Market shares firm/year at the SIC4 level"

gen aa=MS*MS
egen HHI=sum(aa), by(SIC4 year)
label var HHI "HHI index at the SIC4 level"
drop aa


********************************************************
*****           
*****           NETWORK
*****           
********************************************************


replace links1_tot=0 if  links1_tot==.
replace links2_tot=0 if  links2_tot==.
replace links2=0 if links2==.
replace links2_same=0 if links2_same==.
replace links2_diff=0 if links2_diff==.

gen links1_ver= links1_tot-links1
label var links1_ver "Pure vertical links (i.e. links with non-competitors) with double counting"
gen links2_ver= links2_tot-links2
label var links2_ver "Pure vertical links (i.e. links with non-competitors) without double counting"


gen network=links2/(ncomp-1)
label var network "links2/competitors"

* we have to make a small correction since for some obs. the number of links is larger than the number of competitors (# of firms -1)
* this is because we have 11 firms in the RJV that have no sales and we dropped these observations
* these firms are: DEC	DGN.	FMC	GCHI	HLN	HPQ	IDL.1	LCE	MSA.1	NSTS	SDW

replace network=1 if links2>ncomp-1
replace network=0 if network==.

gen network2=network*network
label var network2 "network squared"


gen network_ver=links2_ver/(ncomp-1)
label var network "links2_ver/competitors"

gen network_ver2=network_ver*network_ver
label var network "network_ver squared"

		**** dividing between RJV in the same SIC industry 
		**** and in different SIC industry

gen network_s=links2_same/(ncomp-1)
label var network_s "same ind. links2/competitors"

* we have to make a small correction since for some obs. the number of links is larger than the number of competitors (# of firms -1)

replace network_s=1 if links2_same>ncomp-1
replace network_s=0 if network_s==.

gen network_s2=network_s*network_s
label var network_s2 "network_s squared"

gen network_d=links2_diff/(ncomp-1)
label var network_d "diff. ind. links2/competitors"

* we have to make a small correction since for some obs. the number of links is larger than the number of competitors (# of firms -1)
replace network_d=1 if links2_diff>ncomp-1
replace network_d=0 if network_d==.

gen network_d2=network_d*network_d
label var network_d2 "network_d squared"




********************************************************
*****           
*****           RJV in same/different industry
*****           
********************************************************

drop  RJVsame RJVdiff
gen RJVsame=0
replace RJVsame=1 if links2>0
label var RJVsame "=1 if the firm meets at least one competitior in the RJV(s)"

gen RJVdiff=0
replace RJVdiff= RJV-RJVsame if RJV>0
label var RJVdiff "=1 if the firm does not meet any competitior in the RJV(s)"

gen lRJVtotal=log(RJVtotal+1)
label var lRJVtotal "log(RJVtotal+1)"



********************************************************
*****           
*****           here we generate some additional variables
*****           
********************************************************

use "E:\REStat_MS14767_Vol96(2)\Estimation\estimation_sample", clear
so ticker year
merge ticker year using "networkMS.dta"
drop _merge
so ticker year
merge ticker year using "asset.dta"
drop if _merge==2

gen lta=log(ta+1)
label var ta "Log total assents in millions dollars"
drop _merge

* here we add the infos about the network built on the largest RJV (max_links)

so ticker year

merge ticker year using "max_links.dta"

gen network_n=links2_n/(ncomp-1)
gen network_max=max_links2/(ncomp-1)
gen network_av=network/RJVsametot
gen network_av_n=network_n/RJVsametot

drop _merge
so SIC2 year
merge SIC2 year using "import_dataSIC2.dta"


**** We create a new measure of MS to weight the MS of those (conglomerate) firms which are in several SICs
* one possibility for weighting is to give each first sic code a weight
* of 2/(n+1), and subsequent sics would get either each 1/(n+1) or some
* further scaling down, but what weight exactlz we would give to subsequent
* ones is at this point not so relevant, since we would give them now zero.
* one could give the argumentation that sics are not ranked according to
* importance, only the first one (which i think is true)

egen SICn=rownonmiss(SIC4 SICALL2 SICALL3 SICALL4 SICALL5 SICALL6 SICALL7 SICALL8 SICALL9 SICALL10 SICALL11 SICALL12 SICALL13 SICALL14 SICALL15)
gen MSw=MS*2/(1+SICn)

* here we first define those industries where no RJV was formed
* these observations will represent the "zeros"

gen aa=RJV
egen bb=max(aa), by(SIC4)
replace bb=0 if bb==.
gen zeros=0
replace zeros=1 if bb==0
drop aa bb

gen aa=RJVsame
egen bb=max(aa), by(SIC4)
gen zero_same=0
replace zero_same=1 if bb==0
drop aa bb

replace patents=0 if patents==.
replace patents_t_1=0 if patents_t_1==.
replace patents_t_2=0 if patents_t_2==.
replace patents_t_3=0 if patents_t_3==.
replace patents_t_4=0 if patents_t_4==.


gen only_ver=0
replace only_ver=1 if network==0 & RJV==1


* here we redefine the RJV dummies

gen RJV2=RJV
replace RJV2=0 if RJV2==.

* horizontal vs. vertical

gen RJVsame2=RJVsame
replace RJVsame2=0 if RJVsame2==.
rename RJVsame2 RJVhor
label var RJVhor "RJV with firms from the same industry"


gen RJVdiff2=RJVdiff
replace RJVdiff2=0 if RJVdiff==.
rename RJVdiff2 RJVver
label var RJVver "RJV without firms from the same industry"

* horizontal in the same industry vs. different industry

gen RJVsame2=0
replace RJVsame2=1 if network_s>0
label var RJVsame2 "RJV with competitors in the same industry"

gen RJVdiff2=0
replace RJVdiff2=1 if RJVsame2==0 & RJVhor==1
label var RJVsame2 "RJV with competitors in different industries"


* we replace the missing with zeros in the r&d variable 

replace ta=0 if ta==.
gen rd2=rdexpense
replace rd2=0 if rd2==. | rd2<0
gen lrd2=log(rd2+1)

* we create a R&D intensity variable
gen rd_int=rd2/sales


* we build a r&d stock

so ticker year
by ticker: gen yearweight=year[_n+1]-year if year!=year[_n+1]
gen xxx=rdexpense
replace xxx=0 if xxx==.
so ticker year
by ticker: gen rd_stock=xxx+xxx[_n-1]*(1-0.15)^yearweight if year!=year[_n-1]
label var rd_stock "r&d stock, depreciated"
drop xxx
replace rd_stock=rd_stock[_n-1] if year==year & ticker==ticker[_n-1] & rd_stock==.
replace rd_stock=0 if rd_stock==.
so ticker year

replace rd_stock=0 if rd_stock<0
gen lrd_stock=log(rd_stock+1)


* we generate industry specific variables

egen m_rd=mean(rd2), by(SIC4 year)
label var m_rd "Mean R&D exp. at the SIC4/year level

egen m_lrd=mean(lrd2), by(SIC4 year)
label var m_rd "Mean log R&D exp. at the SIC4/year level

egen m_rdexpense=mean(rdexpense), by(SIC4 year)
label var m_rdexpense "Mean R&D exp. at the SIC4/year level"

gen lmkval=log(mkvalq)
egen m_lmkval=mean(lmkval), by(SIC4 year)
label var m_lmkval "Mean log MV at the SIC4/year level"

egen m_mkval=mean(mkval), by(SIC4 year)
label var m_mkval "Mean MV at the SIC4/year level"


*gen lmkval=log(mkvalq)
gen lmkval2=lmkval
replace lmkval2=0 if lmkval==.

egen m_lmkval2=mean(lmkval2), by(SIC4 year)
label var m_lmkval "Mean log MV at the SIC4/year level

gen only_horiz=0
replace only_horiz=1 if network>0

egen nRJVfirm_hor=sum(only_horiz), by(SIC4 year)
gen coverage= nRJVfirm_hor/ncom
replace coverage=0 if network==0
gen coverage2=coverage*coverage

gen only_horiz_s=0
replace only_horiz_s=1 if network_s>0


*** we generate the vertical links

replace links2_tot=0 if links2_tot==.
gen links2_ver=links2_tot-links2_same-links2_diff
gen links_ver=links2_ver
*label var RJVeff2 "Heterogenous effect (RJV dummies)"
*label var RJVeff_s2 "Heterogenous effect (RJV dummies)"
label var network "RJV network coverage"
label var network_s "RJV network coverage"
label var network_d "RJV network coverage"
label var networkMS "RJV network coverage in MS"
label var networkMS_s "RJV network coverage in MS"
label var networkMS_d "RJV network coverage in MS"
label var RJV2 "RJV"
label var RJVver "Vertical RJV"
label var RJVhor "Horizontal RJV"
label var links_ver "Vertical links"
label var lrd2 "log(R&D)"
label var m_lmkval "log(MV)"
label var m_lmkval "log(MV) SIC4"
label var m_lrd "log(R&D) SIC4"

gen RJVhor_s=0
replace RJVhor_s=1 if network>0 & network<.031746

gen RJVhor_m=0
replace RJVhor_m=1 if network>=.031746 & network<.1948052

gen RJVhor_l=0
replace RJVhor_l=1 if network>=.1948052

keep sales_new ta rd2 rd_int patents network network2 networkMS coverage RJV RJV2 RJVhor RJVver RJVhor_s RJVhor_m RJVhor_l RJVsametot links2_ver m_rdexpense  m_lmkval2 ncomp lrd2 m_lmkval m_lrd patents_t_1 patents_t_2 patents_t_3 lta lrd2 m_lrd m_lmkval SIC4 firmnum year MS zero_same dy* ticker MSw
save RESTATestimation_sample_final.dta, replace

log close
